library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer) ; library(ggpubr)
library(biomaRt)
library(caret) ; library(ROCR) ; library(car) ; library(MLmetrics)
library(corrplot)
library(expss) ; library(knitr)
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
Load dataset (preprocessing code in 20_04_07_create_dataset.html)
# Clusterings created by WGCNA
clustering_selected = 'DynamicHybrid'
clusterings = read_csv('./../Data/clusters.csv')
clusterings$Module = clusterings[,clustering_selected] %>% data.frame %>% unlist %>% unname
assigned_module = clusterings %>% dplyr::select(ID, Module)
# Dataset created with 20_04_07_create_dataset.html
dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names=1)
dataset$Module = clusterings$Module
# Update gene scores to new SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
gene_scores = dataset %>% mutate(ID = rownames(.)) %>%
left_join(SFARI_genes %>% dplyr::select(ID, `gene-score`)) %>% pull(`gene-score`)
rownames_dataset = rownames(dataset)
dataset = dataset %>% mutate(gene.score = gene_scores)
rownames(dataset) = rownames_dataset
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# Add gene symbol
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(dataset), mart=mart)
rm(getinfo, mart, rownames_dataset, GO_annotations)
rm_cluster = dataset[is.na(dataset$MTcor),'Module'] %>% unique %>% as.character
print(paste0('Removing ', sum(dataset$Module=='gray'), ' genes without cluster'))
## [1] "Removing 113 genes without cluster"
new_dataset = dataset %>% filter(Module != 'gray' & !is.na(MTcor))
Using Module Membership variables instead of binary module membership
Not including p-value variables
Including a new variable with the absolute value of GS
Removing information from gray module (unclassified genes)
Objective variable: Binary label indicating if it’s in the SFARI dataset or not
new_dataset = new_dataset %>% dplyr::select(-c(matches(paste('pval|Module')), MMgray)) %>%
mutate('absGS'=abs(GS), 'SFARI'=!is.na(gene.score)) %>%
dplyr::select(-gene.score)
rownames(new_dataset) = rownames(dataset)[dataset$Module != 'gray']
rm(rm_cluster)
original_dataset = dataset
dataset = new_dataset
print(paste0('The final dataset contains ', nrow(dataset), ' observations and ', ncol(dataset), ' variables'))
## [1] "The final dataset contains 16034 observations and 58 variables"
rm(new_dataset)
The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and both SFARI and absGS in the middle of both groups
mtcor_by_module = original_dataset %>% dplyr::select(Module, MTcor) %>% unique
colnames(mtcor_by_module) = c('ID','MTcor')
pca = dataset %>% t %>% prcomp
plot_data = data.frame('ID'=colnames(dataset), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2],
type=ifelse(grepl('MM', colnames(dataset)),'ModMembership',
ifelse(grepl('SFARI', colnames(dataset)), 'SFARI',
ifelse(grepl('GS', colnames(dataset)), 'GS', 'MTcor'))))
plot_data = mtcor_by_module %>% mutate(ID = gsub('#','MM.',ID)) %>% right_join(plot_data, by='ID')
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(aes(id=ID)) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
scale_colour_distiller(palette = 'RdBu', na.value = 'darkgrey') + theme_minimal() +
ggtitle('PCA of variables coloured by Module-Diagnosis correlation'))
rm(mtcor_by_module, tsne)
The two main patterns that seem to characterise the genes are their Gene Significance and the Module-Diagnosis correlation of their corresponding module
Mean Expression doesn’t seem to play an important role
I don’t know what the 2nd PC is capturing
# Mean Expression data
load('./../Data/preprocessed_data.RData')
## Registered S3 methods overwritten by 'Hmisc':
## method from
## [.labelled expss
## print.labelled expss
## as.data.frame.labelled expss
datExpr = datExpr %>% data.frame
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))
# PCA
pca = dataset %>% t %>% prcomp
plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2],
'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(alpha=0.4) + scale_color_viridis() +
theme_minimal() + ggtitle('Genes coloured by Module-Diagnosis correlation') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
theme(legend.position='bottom')
p2 = plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.4) + scale_color_viridis() +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
theme_minimal() + ggtitle('Genes coloured by Gene Significance') + theme(legend.position='bottom')
p3 = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(alpha = plot_data$alpha) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')
p3 = ggExtra::ggMarginal(p3, type='density', groupColour=TRUE, size=10)
p4 = plot_data %>% ggplot(aes(PC1, PC2, color=meanExpr)) + geom_point(alpha=0.4) + scale_color_viridis() +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
theme_minimal() + ggtitle('Genes coloured by mean level of expression') + theme(legend.position='bottom')
grid.arrange(p1, p2, p3, p4, nrow=2)
rm(pca, datExpr, datGenes, datMeta, dds, DE_info, mean_expr, p1, p2, p3, p4)
4.91% of the observations are positive
print(table(dataset$SFARI))
##
## FALSE TRUE
## 15247 787
#cat(paste0('\n',round(mean(dataset$SFARI)*100,2), '% of the observations are positive'))
For now, will do this using over- and under-sampling of the classes, but later on should check SMOTE (Synthetic Minority Over-sampling Technique) method
Need to divide first into train and test sets to keep the sets independent: using 80% of the Positive observations on the training set
Note: Even though our label is binary, I want to have representative samples for all SFARI scores in both the training and test data, so instead of pooling all the SFARI scores together and randomly selecting 80% of the samples, I’m going to create the positive set selecting 80% of each of the samples by score
set.seed(123)
positive_sample_balancing_SFARI_scores = function(p){
positive_train_idx = c()
positive_test_idx = c()
for(score in 1:3){
score_genes = rownames(original_dataset)[rownames(original_dataset) %in% rownames(dataset) & original_dataset$gene.score == score]
score_idx = which(rownames(dataset) %in% score_genes)
score_train_idx = sample(score_idx, size = ceiling(p*length(score_idx)))
score_test_idx = score_idx[!score_idx %in% score_train_idx]
positive_train_idx = c(positive_train_idx, score_train_idx)
positive_test_idx = c(positive_test_idx, score_test_idx)
}
return(list('train' = sort(positive_train_idx), 'test' = sort(positive_test_idx)))
}
# 80% of the samples for the training set
p = 0.8
positive_idx = positive_sample_balancing_SFARI_scores(p)
positive_train_idx = positive_idx[['train']]
positive_test_idx = positive_idx[['test']]
negative_idx = which(!dataset$SFARI)
negative_train_idx = sort(sample(negative_idx, size=ceiling(p*length(negative_idx))))
negative_test_idx = negative_idx[!negative_idx %in% negative_train_idx] # This is overwritten below because of the subbsampling
train_set = dataset[c(positive_train_idx, negative_train_idx),]
test_set = dataset[c(positive_test_idx, negative_test_idx),] # This is overwritten below because of the subbsampling
rm(positive_idx, negative_idx, positive_train_idx, positive_test_idx, negative_train_idx, negative_test_idx,
p, positive_sample_balancing_SFARI_scores)
Over-sampling observations with positive SFARI label: Sample with replacement 4x original number of observations
Sample with replacement positive observations in train set
positive_obs = which(train_set$SFARI)
add_obs = sample(positive_obs, size=3*length(positive_obs), replace=TRUE)
train_set = train_set[c(1:nrow(train_set), add_obs),]
rm(positive_obs, add_obs)
Under-sampling observations with negative SFARI labels
cat(paste0('Keeping ~',round(100*sum(train_set$SFARI)/sum(!train_set$SFARI)),
'% of the Negative observations in the training set'))
## Keeping ~21% of the Negative observations in the training set
negative_obs = which(!train_set$SFARI)
keep_obs = sample(negative_obs, size=sum(train_set$SFARI))
train_set = train_set[c(keep_obs, which(train_set$SFARI)),]
# Add observations we removed from the training data to the test data
lost_obs = !rownames(dataset) %in% c(rownames(train_set), rownames(test_set))
cat(paste0('Adding ', sum(lost_obs), ' Negative samples to the test set from the ones we just removed from the training set'))
## Adding 9674 Negative samples to the test set from the ones we just removed from the training set
test_set = rbind(test_set, dataset[lost_obs,])
rm(negative_obs, keep_obs)
Label distribution in training set
cro(train_set$SFARI)
|  #Total | |
|---|---|
| Â train_set$SFARIÂ | |
| Â Â Â FALSEÂ | 2524 |
| Â Â Â TRUEÂ | 2524 |
|    #Total cases | 5048 |
Labels distribution in test set
cro(test_set$SFARI)
|  #Total | |
|---|---|
| Â test_set$SFARIÂ | |
| Â Â Â FALSEÂ | 12723 |
| Â Â Â TRUEÂ | 156 |
|    #Total cases | 12879 |
train_set$SFARI = train_set$SFARI %>% as.factor
fit = glm(SFARI~., data=train_set, family='binomial')
The features are strongly correlated, which inflates the standard error of the coefficients, making them no longer interpretable
Variance Inflation Factor (VIF) and correlation plot
# VIF
plot_data = data.frame('Feature' = car::vif(fit) %>% sort %>% names,
'VIF' = car::vif(fit) %>% sort %>% unname) %>%
mutate(outlier = VIF>10)
plot_data %>% ggplot(aes(reorder(Feature, -VIF), VIF, fill = !outlier)) + geom_bar(stat='identity') + scale_y_log10() +
geom_hline(yintercept = 10, color = 'gray', linetype = 'dashed') + xlab('Model Features') + theme_minimal() +
theme(legend.position = 'none', axis.text.x = element_text(angle = 90, hjust = 1))
# Correlation plot
corrplot.mixed(cor(train_set[,-ncol(train_set)]), lower = 'number', lower.col = 'gray', number.cex = .6,
tl.pos = 'l', tl.col = '#666666')
rm(getinfo, mart, clusterings)
Remove all variables with a VIF>10: We would lose all but two of our variables, not ideal
Do Principal Component Regression: We would lose the relation between the prediction and the original features, which could be interesting to study
Don’t do anything: Multicollinearity affects the coefficients and p-values of the regression, but it doesn’t affect the predictions, precision of the predictions or the goodness-of-fit statistics ref, but as with the previous option, we cannot study the coefficients of the regression
Use Ridge Regression: The penalty it gives to high coefficients reduces the variance introduced by the correlation, making the coefficients interpretable again
Notes:
Running the model multiple times to get more acurate measurements of its performance
Over-sampling positive samples in the training set to obtain a 1:1 class ratio
### DEFINE FUNCTIONS
# Create positive training set including all SFARI scores
positive_sample_balancing_SFARI_scores = function(p, seed){
set.seed(seed)
positive_train_idx = c()
for(score in 1:3){
score_genes = rownames(original_dataset)[rownames(original_dataset) %in% rownames(dataset) & original_dataset$gene.score == score]
score_idx = which(rownames(dataset) %in% score_genes)
score_train_idx = sample(score_idx, size = ceiling(p*length(score_idx)))
positive_train_idx = c(positive_train_idx, score_train_idx)
}
return(positive_train_idx)
}
create_train_test_sets = function(p, over_sampling_fold, seed){
### CREATE POSITIVE TRAINING SET (balancing SFARI scores and over-sampling)
positive_train_idx = positive_sample_balancing_SFARI_scores(p, seed)
add_obs = sample(positive_train_idx, size = ceiling(over_sampling_fold*length(positive_train_idx)), replace=TRUE)
positive_train_idx = c(positive_train_idx, add_obs)
### CREATE NEGATIVE TRAINING SET
negative_idx = which(!dataset$SFARI)
negative_train_idx = sample(negative_idx, size = length(positive_train_idx))
### CREATE TRAIN AND TEST SET
train_set = dataset[sort(c(positive_train_idx, negative_train_idx)),]
test_set = dataset[-unique(c(positive_train_idx, negative_train_idx)),]
return(list('train_set' = train_set, 'test_set' = test_set))
}
run_model = function(p, over_sampling_fold, seed){
# Create train and test sets
train_test_sets = create_train_test_sets(p, over_sampling_fold, seed)
train_set = train_test_sets[['train_set']] %>% data.frame
test_set = train_test_sets[['test_set']]
# Train Model
train_set$SFARI = train_set$SFARI %>% as.factor
lambda_seq = 10^seq(1, -4, by = -.1)
set.seed(123)
fit = train(SFARI ~., data = train_set, method = 'glmnet', trControl = trainControl('cv', number = 10),
tuneGrid = expand.grid(alpha = 0, lambda = lambda_seq))
# Predict labels in test set
predictions = fit %>% predict(test_set, type='prob')
preds = data.frame('ID'=rownames(test_set), 'prob'=predictions$`TRUE`) %>% mutate(pred = prob>0.5)
# Measure performance of the model
acc = mean(test_set$SFARI==preds$pred)
prec = Precision(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
rec = Recall(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
F1 = F1_Score(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
pred_ROCR = prediction(preds$prob, test_set$SFARI)
AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]
# Extract coefficients from features
coefs = coef(fit$finalModel, fit$bestTune$lambda) %>% as.vector
return(list('acc' = acc, 'prec' = prec, 'rec' = rec, 'F1' = F1,
'AUC' = AUC, 'preds' = preds, 'coefs' = coefs))
}
### RUN MODEL
# Parameters
p = 0.8
over_sampling_fold = 3
n_iter = 100
seeds = 123:(123+n_iter-1)
# Store outputs
acc = c()
prec = c()
rec = c()
F1 = c()
AUC = c()
predictions = data.frame('ID' = rownames(dataset), 'SFARI' = dataset$SFARI, 'prob' = 0, 'pred' = 0, 'n' = 0)
coefs = data.frame('var' = c('Intercept', colnames(dataset[,-ncol(dataset)])), 'coef' = 0)
for(seed in seeds){
# Run model
model_output = run_model(p, over_sampling_fold, seed)
# Update outputs
acc = c(acc, model_output[['acc']])
prec = c(prec, model_output[['prec']])
rec = c(rec, model_output[['rec']])
F1 = c(F1, model_output[['F1']])
AUC = c(AUC, model_output[['AUC']])
preds = model_output[['preds']]
coefs$coef = coefs$coef + model_output[['coefs']]
update_preds = preds %>% dplyr::select(-ID) %>% mutate(n=1)
predictions[predictions$ID %in% preds$ID, c('prob','pred','n')] = predictions[predictions$ID %in%
preds$ID, c('prob','pred','n')] +
update_preds
}
coefs = coefs %>% mutate(coef = coef/n_iter)
predictions = predictions %>% mutate(prob = prob/n, pred_count = pred, pred = prob>0.5)
rm(p, over_sampling_fold, seeds, update_preds, positive_sample_balancing_SFARI_scores, create_train_test_sets, run_model)
To summarise in a single value the predictions of the models:
prob = Average of all the probabilities
pred = 1 if prob>0.5, 0 otherwise
test_set = predictions %>% filter(n>0) %>% left_join(dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, GS, MTcor), by = 'ID')
rownames(test_set) = predictions$ID[predictions$n>0]
conf_mat = test_set %>% apply_labels(SFARI = 'Actual Labels',
prob = 'Assigned Probability',
pred = 'Label Prediction')
cro(conf_mat$SFARI, list(conf_mat$pred, total()))
|  Label Prediction |  #Total | |||
|---|---|---|---|---|
| Â FALSEÂ | Â TRUEÂ | Â | ||
|  Actual Labels | ||||
| Â Â Â FALSEÂ | 10038 | 5209 | Â | 15247 |
| Â Â Â TRUEÂ | 297 | 490 | Â | 787 |
|    #Total cases | 10335 | 5699 |  | 16034 |
rm(conf_mat)
pred_ROCR = prediction(test_set$prob, test_set$SFARI)
roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
auc = performance(pred_ROCR, measure='auc')@y.values[[1]]
plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(auc,2),')'), col='#009999')
abline(a=0, b=1, col='#666666')
lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')
rm(pred_ROCR, roc_ROCR, AUC, lift_ROCR)
MTcor and absGS have a very small coefficient and Gene Significance has a negative coefficient
gene_corr_info = dataset %>% mutate('ID' = rownames(dataset)) %>% dplyr::select(ID, MTcor, SFARI) %>% left_join(assigned_module, by ='ID') %>%
mutate(Module = gsub('#','',Module))
coef_info = coefs %>% mutate('feature' = gsub('MM.','',var)) %>% left_join(gene_corr_info, by = c('feature' = 'Module')) %>%
dplyr::select(feature, coef, MTcor, SFARI) %>% group_by(feature, coef, MTcor) %>% summarise('SFARI_perc' = mean(SFARI)) %>%
arrange(desc(coef))
kable(coef_info %>% dplyr::select(feature, coef) %>% rename('Feature' = feature, 'Coefficient' = coef),
align = 'cc', caption = 'Regression Coefficients')
| Feature | Coefficient |
|---|---|
| 00A8FF | 0.7185317 |
| B79F00 | 0.5181573 |
| 8195FF | 0.4979092 |
| EE67EC | 0.4900560 |
| F8766D | 0.4773346 |
| 00BECF | 0.4050179 |
| BF80FF | 0.3426857 |
| 00C083 | 0.2772603 |
| FE61CF | 0.2509699 |
| 2EA2FF | 0.2411567 |
| 00BF74 | 0.2379291 |
| 619CFF | 0.2307058 |
| 00BFC4 | 0.2249593 |
| DB8E00 | 0.2247468 |
| 00B9E3 | 0.2191021 |
| FA62DA | 0.2113841 |
| 00ADFA | 0.2071333 |
| 00BC50 | 0.2070501 |
| 00C0B9 | 0.1970214 |
| 73B000 | 0.1921302 |
| FC727E | 0.1835976 |
| D39200 | 0.1652660 |
| EE8045 | 0.1572418 |
| FF66AA | 0.1432425 |
| CA9700 | 0.0990985 |
| A0A600 | 0.0951683 |
| 00BBDA | 0.0928223 |
| 00C19F | 0.0557449 |
| FF699C | 0.0447042 |
| FE6D8D | 0.0378138 |
| DB72FB | 0.0110478 |
| MTcor | -0.0156356 |
| 998EFF | -0.0189111 |
| absGS | -0.0476225 |
| F564E3 | -0.0574365 |
| 00B80F | -0.0628890 |
| ACA300 | -0.0716275 |
| CE79FF | -0.0770455 |
| GS | -0.0945027 |
| 84AD00 | -0.1000193 |
| FF61C3 | -0.1217880 |
| 00C092 | -0.1338882 |
| 93AA00 | -0.1387393 |
| F37B5A | -0.1415803 |
| 00BD63 | -0.1497667 |
| FF63B7 | -0.1650215 |
| AE87FF | -0.1673525 |
| 5EB300 | -0.1722345 |
| E56CF4 | -0.1812170 |
| E88526 | -0.2018223 |
| 41B500 | -0.2118645 |
| Intercept | -0.2310382 |
| 00BA38 | -0.2410595 |
| 00B6EC | -0.3506428 |
| C19B00 | -0.3714003 |
| 00B2F4 | -0.4233121 |
| E28900 | -0.4517536 |
| 00C1AC | -0.5237469 |
ggplotly(coef_info %>% dplyr::rename('Module' = feature) %>% filter(!is.na(MTcor)) %>%
ggplot(aes(coef, SFARI_perc)) + geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) +
geom_point(aes(id = Module), color = paste0('#',coef_info$feature[!is.na(coef_info$MTcor)])) +
theme_minimal() + xlab('Coefficient') +
ylab('% of SFARI genes in Module'))
This is not a surprise since we knew that there was a negative relation between SFARI genes and Module-Diagnosis correlation from Preprocessing/Gandal/AllRegions/RMarkdowns/20_04_03_WGCNA_modules_EA.html. The fact that there is no relation between coefficient and Module-Diagnosis correlation could even be a good sign that the model is picking some biological signal as well as the SFARI patterns (since the relation with the biological signals is positive)
ggplotly(coef_info %>% dplyr::rename('Module' = feature) %>% filter(!is.na(MTcor)) %>%
ggplot(aes(coef, MTcor)) + geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) +
geom_point(aes(id = Module), color = paste0('#',coef_info$feature[!is.na(coef_info$MTcor)])) +
theme_minimal() + xlab('Coefficient') +
ylab('Module-Diagnosis correlation'))
SFARI genes have a higher score distribution than the rest, but the overlap is large
plot_data = test_set %>% dplyr::select(prob, SFARI)
ggplotly(plot_data %>% ggplot(aes(prob, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + xlab('Score') +
geom_vline(xintercept = mean(plot_data$prob[plot_data$SFARI]), color = '#00C0C2', linetype = 'dashed') +
geom_vline(xintercept = mean(plot_data$prob[!plot_data$SFARI]), color = '#FF7371', linetype = 'dashed') +
theme_minimal() + ggtitle('Model score distribution by SFARI Label'))
plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, prob) %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
mutate(gene.score = ifelse(is.na(gene.score), ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'),
gene.score)) %>%
dplyr::select(ID, prob, gene.score) %>% apply_labels(gene.score='SFARI Gene score')
cro(plot_data$gene.score)
|  #Total | |
|---|---|
|  SFARI Gene score | |
| Â Â Â 1Â | 143 |
| Â Â Â 2Â | 205 |
| Â Â Â 3Â | 439 |
|    Neuronal | 933 |
|    Others | 14314 |
|    #Total cases | 16034 |
mean_vals = plot_data %>% group_by(gene.score) %>% summarise(mean_prob = mean(prob))
ggplotly(plot_data %>% ggplot(aes(gene.score, prob, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) +
ggtitle('Distribution of probabilities by SFARI score') +
xlab('SFARI score') + ylab('Probability') + theme_minimal())
comparisons = list(c('1','Neuronal'), c('2','Neuronal'), c('3','Neuronal'), c('1','Others'), c('2','Others'),
c('3','Others'), c('1','2'), c('2','3'), c('Neuronal', 'Others'))
plot_data %>% ggplot(aes(gene.score, prob, color = gene.score)) +
stat_summary(fun.data = mean_sd, geom = 'errorbar', width = 0.5, color = '#999999') +
stat_summary(fun = mean, geom = 'point', size = 3) +
stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE),
label.y = c(.8, .75, .7, .95, .9, .85, rep(1, 3)), tip.length = 0.01) +
#label.y = c(.5, .45, .4, .7, .65, .6, rep(0.8, 3)), tip.length = 0.01) +
ggtitle('Distribution of probabilities by SFARI score') +
xlab('SFARI Score') + ylab('Probability') +
scale_colour_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) +
theme_minimal() + theme(legend.position = 'none')
rm(mean_vals)
Genes with highest scores in test set
Considering the class imbalance in the test set (1:19), there are many more SFARI scores in here (1:3)
14/18 SFARI Genes in this list belong to the SFARI Score 1, 2 to Score 2 and only 1 to Score 3
test_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(test_set)) %>%
arrange(desc(prob)) %>% top_n(50, wt=prob) %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
mutate(gene.score = ifelse(is.na(gene.score), ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'),
gene.score)) %>%
left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob, 'ModuleDiagnosis_corr' = MTcor, 'GeneSignificance' = GS) %>%
mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr,4), Probability = round(Probability,4),
GeneSignificance = round(GeneSignificance,4)) %>%
dplyr::select(GeneSymbol, GeneSignificance, ModuleDiagnosis_corr, Module, Probability, gene.score) %>%
kable(caption = 'Genes with highest model probabilities from the test set')
| GeneSymbol | GeneSignificance | ModuleDiagnosis_corr | Module | Probability | gene.score |
|---|---|---|---|---|---|
| RPRD2 | -0.0922 | -0.5528 | #FE61CF | 0.8747 | Others |
| MBD5 | 0.1943 | 0.0586 | #B79F00 | 0.8734 | 1 |
| MYCBP2 | -0.3975 | -0.4939 | #8195FF | 0.8706 | Neuronal |
| EP300 | -0.0234 | 0.1127 | #00A8FF | 0.8682 | 1 |
| TANC2 | -0.2348 | -0.5528 | #FE61CF | 0.8665 | 1 |
| RAPH1 | -0.0534 | -0.4939 | #8195FF | 0.8571 | Others |
| KMT2A | 0.1347 | 0.5941 | #619CFF | 0.8551 | 1 |
| NFAT5 | 0.1035 | 0.0586 | #B79F00 | 0.8547 | Others |
| ANK2 | -0.4368 | -0.9355 | #00B9E3 | 0.8528 | 1 |
| PRDM2 | -0.2518 | 0.0586 | #B79F00 | 0.8520 | Others |
| RFX7 | 0.1372 | 0.0586 | #B79F00 | 0.8519 | Others |
| ARID1B | 0.2711 | 0.1127 | #00A8FF | 0.8499 | 1 |
| C20orf112 | 0.0314 | 0.1127 | #00A8FF | 0.8418 | Others |
| EP400 | -0.1671 | -0.5528 | #FE61CF | 0.8416 | 2 |
| SRRM4 | -0.4400 | -0.6877 | #BF80FF | 0.8416 | Others |
| FMNL1 | -0.2223 | -0.6877 | #BF80FF | 0.8404 | Others |
| ANK3 | -0.4814 | 0.0586 | #B79F00 | 0.8401 | 1 |
| HIVEP2 | 0.0165 | -0.9355 | #00B9E3 | 0.8393 | 1 |
| RNF111 | -0.2410 | 0.0586 | #B79F00 | 0.8380 | Others |
| CREBBP | 0.1431 | 0.1127 | #00A8FF | 0.8349 | 1 |
| HUNK | -0.3273 | -0.5100 | #F8766D | 0.8333 | Others |
| KMT2D | -0.3255 | -0.5528 | #FE61CF | 0.8324 | Others |
| HECTD4 | -0.3170 | -0.5528 | #FE61CF | 0.8320 | 1 |
| KCNJ6 | -0.1379 | -0.9355 | #00B9E3 | 0.8310 | Others |
| TLE3 | 0.4884 | 0.1127 | #00A8FF | 0.8308 | Others |
| GRIN2B | -0.2761 | -0.7497 | #FF66AA | 0.8296 | 1 |
| ASAP1 | -0.0896 | -0.6877 | #BF80FF | 0.8296 | Others |
| KIAA1109 | -0.1173 | -0.0094 | #00BECF | 0.8277 | Others |
| GATAD2B | -0.4221 | -0.5528 | #FE61CF | 0.8275 | Others |
| SRCAP | -0.3623 | -0.5528 | #FE61CF | 0.8245 | 1 |
| CELF2 | -0.0605 | -0.9355 | #00B9E3 | 0.8239 | Others |
| DROSHA | -0.4111 | -0.5528 | #FE61CF | 0.8235 | Others |
| SAP130 | -0.2482 | -0.5528 | #FE61CF | 0.8225 | Others |
| TP53INP2 | -0.2140 | -0.4946 | #2EA2FF | 0.8215 | Others |
| ARID1A | -0.1378 | -0.5528 | #FE61CF | 0.8213 | Others |
| DAAM1 | 0.1579 | -0.0094 | #00BECF | 0.8209 | Others |
| CACNG3 | -0.4689 | -0.6877 | #BF80FF | 0.8199 | Others |
| BAZ2A | 0.0534 | -0.0094 | #00BECF | 0.8189 | Others |
| PCLO | -0.1879 | 0.0586 | #B79F00 | 0.8188 | Others |
| KMT2E | 0.0723 | 0.5941 | #619CFF | 0.8176 | 2 |
| TLN2 | -0.4783 | -0.7497 | #FF66AA | 0.8169 | Others |
| ZNF804A | -0.2768 | -0.5100 | #F8766D | 0.8168 | 2 |
| RIMBP2 | 0.2615 | -0.0094 | #00BECF | 0.8167 | Others |
| DLGAP4 | -0.3307 | -0.5528 | #FE61CF | 0.8165 | Others |
| POGZ | 0.0391 | 0.1127 | #00A8FF | 0.8164 | 1 |
| ETV6 | -0.1418 | -0.6877 | #BF80FF | 0.8163 | Others |
| MED13L | 0.3795 | 0.0586 | #B79F00 | 0.8160 | 1 |
| AMPD3 | -0.2810 | -0.9355 | #00B9E3 | 0.8158 | Others |
| NAV1 | -0.1734 | -0.5528 | #FE61CF | 0.8154 | Neuronal |
| GABBR2 | -0.6249 | -0.9355 | #00B9E3 | 0.8151 | 3 |
Selecting the Negative samples in the test set
negative_set = test_set %>% filter(!SFARI)
negative_set_table = negative_set %>% apply_labels(prob = 'Assigned Probability',
pred = 'Label Prediction')
cro(negative_set_table$pred)
|  #Total | |
|---|---|
|  Label Prediction | |
| Â Â Â FALSEÂ | 10038 |
| Â Â Â TRUEÂ | 5209 |
|    #Total cases | 15247 |
cat(paste0('\n', sum(negative_set$pred), ' genes are predicted as ASD-related'))
##
## 5209 genes are predicted as ASD-related
negative_set %>% ggplot(aes(prob)) + geom_density(color='#F8766D', fill='#F8766D', alpha=0.5) +
geom_vline(xintercept=0.5, color='#333333', linetype='dotted') + xlab('Probability') +
ggtitle('Probability distribution of the Negative samples in the Test Set') +
theme_minimal()
There’s a lot of noise, but the probability the model assigns to each gene seems to have a negative relation with the Gene Significance (under-expressed genes having on average the higher probabilities and over-expressed genes the lowest)
The pattern is stronger in under-expressed genes
negative_set %>% ggplot(aes(prob, GS, color = MTcor)) + geom_point() +
geom_smooth(method = 'loess', color = '#666666') +
geom_hline(yintercept = 0, color='gray', linetype='dashed') + xlab('Probability') +
scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
ggtitle('Relation between Probability and Gene Significance') + theme_minimal()
There’s not a strong relation between the Module-Diagnosis correlation of the genes assigned module and the probability assigned by the model
The model seems to assign slightly higher probabilities to genes belonging the modules with negative module-Dianosis correlations than to genes belonging to modules with positive ones
negative_set %>% ggplot(aes(MTcor, prob, color=GS)) + geom_point() +
geom_smooth(method='loess', color='#666666') +
geom_hline(yintercept=mean(negative_set$prob), color='gray', linetype='dashed') +
scale_color_gradientn(colours=c('#F8766D','#F8766D','white','#00BFC4','#00BFC4')) +
xlab('Modules ordered by their correlation to ASD') + ylab('Model probability') +
theme_minimal()
Summarised version, plotting by module instead of by gene
The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same
The model seems to give higher probabilities to genes belonging to modules with a small (absolute) correlation to Diagnosis (this is unexpected)
plot_data = negative_set %>% group_by(MTcor) %>% summarise(mean = mean(prob), sd = sd(prob), n = n()) %>%
mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>%
left_join(original_dataset, by='MTcor') %>%
dplyr::select(Module, MTcor, MTcor_sign, mean, sd, n) %>% distinct()
colnames(plot_data)[1] = 'ID'
ggplotly(plot_data %>% ggplot(aes(MTcor, mean, size=n, color=MTcor_sign)) + geom_point(aes(id=ID), alpha=0.7) +
geom_smooth(method='loess', color='gray', se=FALSE) + geom_smooth(method='lm', se=FALSE) +
xlab('Module-Diagnosis correlation') + ylab('Mean Probability by Model') + theme_minimal())
There is a positive relation between level of expression and probability, the model seems to be capturing indirectly the level of expression of the genes to make the prediction, so it’s introducing the same bias
# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))
plot_data = negative_set %>% left_join(mean_and_sd, by='ID') %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>%
dplyr::select(ID, Module), by='ID')
colnames(plot_data)[ncol(plot_data)] = 'Module'
plot_data %>% ggplot(aes(meanExpr, prob)) + geom_point(alpha=0.2, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.3) + xlab('Mean Expression') +
ylab('Probability') + ggtitle('Mean expression vs model probability by gene') +
theme_minimal()
rm(mean_and_sd)
plot_data2 = plot_data %>% group_by(Module) %>% summarise(meanExpr = mean(meanExpr), meanProb = mean(prob),
n=n())
ggplotly(plot_data2 %>% ggplot(aes(meanExpr, meanProb, size=n)) +
geom_point(color=plot_data2$Module, alpha = 0.7) +
geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) +
geom_smooth(method='lm', se=FALSE, color='gray') + theme_minimal() + theme(legend.position='none') +
ggtitle('Mean expression vs model probability by Module'))
rm(plot_data2)
There is a relation between probability and lfc, so it **IS*() capturing a bit of true information (because lfc and mean expression were negatively correlated and it still has a positive relation in the model)
plot_data = negative_set %>% left_join(DE_info %>% mutate(ID=rownames(DE_info)), by='ID')
plot_data %>% ggplot(aes(log2FoldChange, prob)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.3) +
theme_minimal() + ggtitle('LFC vs model probability by gene')
p1 = plot_data %>% filter(log2FoldChange<0) %>% mutate(DE = padj<0.05) %>% ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.1) +
geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Probability') +
ylim(c(min(plot_data$prob), max(plot_data$prob))) +
theme_minimal() + theme(legend.position = 'none', plot.margin=unit(c(1,-0.3,1,1), 'cm'))
p2 = plot_data %>% filter(log2FoldChange>=0) %>% mutate(DE = padj<0.05) %>% ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.1) +
geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Probability') + ylab('') +
scale_y_continuous(position = 'right', limits = c(min(plot_data$prob), max(plot_data$prob))) +
theme_minimal() + theme(plot.margin = unit(c(1,1,1,-0.3), 'cm'), axis.ticks.y = element_blank())
grid.arrange(p1, p2, nrow=1, top = 'LFC vs model probability by gene', bottom = 'LFC')
rm(p1, p2)
The model is capturing the mean level of expression of the genes (indirectly through module memberhsip), which is a strong bias found in the SFARI scores, but it seems to be capturing a bit of true biological signal as well (based on the GS and the log fold change plots)
predictions = test_set
save(predictions, dataset, file='./../Data/Ridge_model_robust_new_SFARI.RData')
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] expss_0.10.2 corrplot_0.84 MLmetrics_1.1.1 car_3.0-7
## [5] carData_3.0-3 ROCR_1.0-7 gplots_3.0.3 caret_6.0-86
## [9] lattice_0.20-41 biomaRt_2.40.5 ggpubr_0.2.5 magrittr_1.5
## [13] RColorBrewer_1.1-2 gridExtra_2.3 viridis_0.5.1 viridisLite_0.3.0
## [17] plotly_4.9.2 knitr_1.28 forcats_0.5.0 stringr_1.4.0
## [21] dplyr_0.8.5 purrr_0.3.3 readr_1.3.1 tidyr_1.0.2
## [25] tibble_3.0.0 ggplot2_3.3.0 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.5
## [3] Hmisc_4.4-0 plyr_1.8.6
## [5] lazyeval_0.2.2 splines_3.6.3
## [7] BiocParallel_1.18.1 crosstalk_1.1.0.1
## [9] GenomeInfoDb_1.20.0 digest_0.6.25
## [11] foreach_1.5.0 htmltools_0.4.0
## [13] gdata_2.18.0 fansi_0.4.1
## [15] checkmate_2.0.0 memoise_1.1.0
## [17] cluster_2.1.0 openxlsx_4.1.4
## [19] annotate_1.62.0 recipes_0.1.10
## [21] modelr_0.1.6 gower_0.2.1
## [23] matrixStats_0.56.0 prettyunits_1.1.1
## [25] jpeg_0.1-8.1 colorspace_1.4-1
## [27] blob_1.2.1 rvest_0.3.5
## [29] haven_2.2.0 xfun_0.12
## [31] crayon_1.3.4 RCurl_1.98-1.1
## [33] jsonlite_1.6.1 genefilter_1.66.0
## [35] survival_3.1-12 iterators_1.0.12
## [37] glue_1.3.2 gtable_0.3.0
## [39] zlibbioc_1.30.0 ipred_0.9-9
## [41] XVector_0.24.0 DelayedArray_0.10.0
## [43] shape_1.4.4 BiocGenerics_0.30.0
## [45] abind_1.4-5 scales_1.1.0
## [47] DBI_1.1.0 miniUI_0.1.1.1
## [49] Rcpp_1.0.4.6 xtable_1.8-4
## [51] progress_1.2.2 htmlTable_1.13.3
## [53] foreign_0.8-76 bit_1.1-15.2
## [55] Formula_1.2-3 stats4_3.6.3
## [57] lava_1.6.7 prodlim_2019.11.13
## [59] glmnet_3.0-2 htmlwidgets_1.5.1
## [61] httr_1.4.1 acepack_1.4.1
## [63] ellipsis_0.3.0 pkgconfig_2.0.3
## [65] XML_3.99-0.3 farver_2.0.3
## [67] nnet_7.3-14 dbplyr_1.4.2
## [69] locfit_1.5-9.4 later_1.0.0
## [71] tidyselect_1.0.0 labeling_0.3
## [73] rlang_0.4.5 reshape2_1.4.3
## [75] AnnotationDbi_1.46.1 munsell_0.5.0
## [77] cellranger_1.1.0 tools_3.6.3
## [79] cli_2.0.2 generics_0.0.2
## [81] RSQLite_2.2.0 broom_0.5.5
## [83] fastmap_1.0.1 evaluate_0.14
## [85] yaml_2.2.1 ModelMetrics_1.2.2.2
## [87] bit64_0.9-7 fs_1.4.0
## [89] zip_2.0.4 caTools_1.18.0
## [91] nlme_3.1-147 mime_0.9
## [93] ggExtra_0.9 xml2_1.2.5
## [95] compiler_3.6.3 rstudioapi_0.11
## [97] png_0.1-7 curl_4.3
## [99] e1071_1.7-3 ggsignif_0.6.0
## [101] reprex_0.3.0 geneplotter_1.62.0
## [103] stringi_1.4.6 highr_0.8
## [105] Matrix_1.2-18 vctrs_0.2.4
## [107] pillar_1.4.3 lifecycle_0.2.0
## [109] data.table_1.12.8 bitops_1.0-6
## [111] httpuv_1.5.2 GenomicRanges_1.36.1
## [113] latticeExtra_0.6-29 R6_2.4.1
## [115] promises_1.1.0 KernSmooth_2.23-17
## [117] rio_0.5.16 IRanges_2.18.3
## [119] codetools_0.2-16 MASS_7.3-51.6
## [121] gtools_3.8.2 assertthat_0.2.1
## [123] SummarizedExperiment_1.14.1 DESeq2_1.24.0
## [125] withr_2.1.2 S4Vectors_0.22.1
## [127] GenomeInfoDbData_1.2.1 mgcv_1.8-31
## [129] parallel_3.6.3 hms_0.5.3
## [131] grid_3.6.3 rpart_4.1-15
## [133] timeDate_3043.102 class_7.3-17
## [135] rmarkdown_2.1 pROC_1.16.2
## [137] shiny_1.4.0.2 base64enc_0.1-3
## [139] Biobase_2.44.0 lubridate_1.7.4